
##############################################################################################################3
#  LASSO
##############################################################################################################3
import pandas as pd
from pylab import *
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
import seaborn as sns
import numpy as np
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
from sklearn.cross_validation import StratifiedKFold

# to nicely display dataframe in the console
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 250)

#####################################################################################################################
# Importing data
path_data = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Data/Python_code_merging/'
path_figure = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Manuscript/Figures/'
myfile = pd.read_stata(os.path.join(path_data, 'Data_Merged_8_13.dta'))


#####################################################################################################################
# Creating interesting variables
myfile["electrified_hh_nbr_log"] = np.log(myfile["electrified_hh_nbr"]+1)
myfile["SH_sum_11_log"] = np.log(myfile["SH_sum_11"]+1)

myfile["SH_sum_11"].describe()
myfile["SH_sum_11_log"].describe()

# myfile['SH_sum_11'].hist()

myfile[myfile["SH_sum_11"] > 5000]['SH_sum_11'].count()

# myfile["SH_sum_11_log"].hist()


# We will run different LASSO models
# model 1 log no controls: y=electrified_hh_nbr_log, X=SH_sum_11_log and polynomials
# model 2 no log no controls
# model 3 log controls
# model 4 no log controls

#####################################################################################################################
# Keeping only useful variables
var_to_keep = ['electrified_hh_nbr_log', 'SH_sum_11_log', 'electrified_hh_nbr', 'SH_sum_11']
data = myfile[var_to_keep]
data = data.dropna()

data["SH_sum_11"].describe()
data["SH_sum_11_log"].describe()
data["electrified_hh_nbr"].describe()
data["electrified_hh_nbr_log"].describe()


data_log = data.rename(columns={'electrified_hh_nbr_log': 'y', 'SH_sum_11_log': 'x'})
data_nolog = data.rename(columns={'electrified_hh_nbr': 'y', 'SH_sum_11': 'x'})
data_log_pos = data_log[data_log['x'] > 0]

lin_space_notlog, lin_space_log, lin_space_logpos = pd.DataFrame(None), pd.DataFrame(None), pd.DataFrame(None)
lin_space_notlog['x'] = np.linspace(data_nolog["x"].min(), data_nolog["x"].max(), 300, endpoint=True)
lin_space_log['x'] = np.linspace(data_log["x"].min(), data_log["x"].max(), 300, endpoint=True)
lin_space_logpos['x'] = np.linspace(data_log_pos["x"].min(), data_log_pos["x"].max(), 300, endpoint=True)

for data in [data_log, data_nolog, lin_space_log, lin_space_notlog, lin_space_logpos]:
    for i in range(1, 10):
        colname = 'x_{}'.format(i/10)
        data[colname] = data['x'].pow(i/10)
    for i in range(2, 16):
        colname = 'x_{}'.format(i)
        data[colname] = data['x'].pow(i)


# Initialize predictors to all 15 powers of x
predictors = ['x']
for i in range(1, 10):
    predictors.append('x_{}'.format(i/10))
for i in range(2, 16):
    predictors.append('x_{}'.format(i))


# creating label for stratification. stratification will be useful for cross validation
# for log data: data_log
data_log.loc[data_log[data_log['y'] == 0].index, 'strat'] = 1
data_log.loc[data_log[(data_log['y'] > 0) & (data_log['y'] < 2)].index, 'strat'] = 2
data_log.loc[data_log[(data_log['y'] >= 2) & (data_log['y'] < 3)].index, 'strat'] = 3
data_log.loc[data_log[(data_log['y'] >= 3) & (data_log['y'] < 4)].index, 'strat'] = 4
data_log.loc[data_log[(data_log['y'] >= 4) & (data_log['y'] < 5)].index, 'strat'] = 5
data_log.loc[data_log[(data_log['y'] >= 5) & (data_log['y'] < 6)].index, 'strat'] = 6
data_log.loc[data_log[(data_log['y'] >= 6)].index, 'strat'] = 8
# data_log['strat'].hist()

# for without log data:
data_nolog.loc[data_nolog[data_nolog['y'] == 0].index, 'strat'] = 1
data_nolog.loc[data_nolog[(data_nolog['y'] > 0) & (data_nolog['y'] < 25)].index, 'strat'] = 2
data_nolog.loc[data_nolog[(data_nolog['y'] >= 25) & (data_nolog['y'] < 50)].index, 'strat'] = 3
data_nolog.loc[data_nolog[(data_nolog['y'] >= 50) & (data_nolog['y'] < 100)].index, 'strat'] = 4
data_nolog.loc[data_nolog[(data_nolog['y'] >= 100) & (data_nolog['y'] < 200)].index, 'strat'] = 5
data_nolog.loc[data_nolog[(data_nolog['y'] >= 200) & (data_nolog['y'] < 400)].index, 'strat'] = 6
data_nolog.loc[data_nolog[(data_nolog['y'] >= 400)].index, 'strat'] = 7
# data_nolog['strat'].hist()

data_log_all = data_log
data_nolog_all = data_nolog
data_log_pos = data_log[data_log['x'] > 0]
data_nolog_pos = data_nolog[data_nolog['x'] > 0]


# 1) Plotting predicted values for normal regressions, without lasso
############  LOG ALL for linear regression (alpha = 0)
data = data_log_all
linreg1 = LinearRegression(normalize=True)
linreg1.fit(data[predictors], data['y'])
y_pred = linreg1.predict(data[predictors])
rss = sum((y_pred - data['y'])**2)
mse_alpha0 = mean((y_pred - data['y'])**2)
coeff_alpha0 = linreg1.coef_

############  LOG ALL for linear regression (alpha = 0) with only x as regressors
data = data_log_all
data['k'] = 1
linreg2 = LinearRegression(normalize=True)
linreg2.fit(data[['x', 'k']], data['y'])
y_pred = linreg2.predict(data[['x', 'k']])
rss = sum((y_pred - data['y'])**2)
mse_lin = mean((y_pred - data['y'])**2)
coeff_lin = linreg2.coef_

# plotting the predicted values
lin_space_log['k'] = 1
fig, ax = plt.subplots()
ax.plot(lin_space_log['x'], linreg1.predict(lin_space_log[predictors]), color='seagreen', marker='o',
        linestyle='None', markersize=3, label='Predicted, model with polynomials')
ax.plot(lin_space_log['x'], linreg2.predict(lin_space_log[['x', 'k']]), color='red', marker='o',
        linestyle='None', markersize=3, label='Predicted, x only as regressor')
plt.xlabel('Village night lights (shape, log of sum, 2011)')
plt.ylabel('Number of households electrified in village (in log)')
plt.scatter(data['x'], data['y'], c='cornflowerblue', s=5, alpha=0.5, label='Observations')
plt.title('Predicted values from simple linear regressions')
plt.show()
xmin, xmax = -2, 10
plt.xlim(xmin, xmax)
ymin, ymax = -2, 12
plt.ylim(ymin, ymax)
plt.legend(loc=2)
plt.savefig(os.path.join(path_figure, 'lasso_CV_log_all_walpha_0.png'))


# 2) LASSO
# parameteres for LassoCV: coordinate descent
powers = list(range(-100, 0, 1)) + list(range(0, 55, 1))
alphas = [10 ** (k / 5) for k in powers]
dict_coeff = {}
alpha = {}

############  LOG ALL 10^-10 to 10^10
# model
data = data_log_all
print('log_all')
print(data.columns.values)
skf = StratifiedKFold(data['strat'], n_folds=100, shuffle=True)
print("Computing regularization path using the coordinate descent lasso...")
model = LassoCV(cv=skf, alphas=alphas).fit(data[predictors], data['y'])
print("Model computed.")
dict_coeff['log_all10'] = model.coef_
alpha['log_all10'] = model.alpha_

# printing the mse path
m_log_alphas = np.log10(model.alphas_)
plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
plt.axvline(np.log10(model.alpha_), linestyle='--', color='k', label='alpha: CV estimate')
plt.legend(loc=0)
plt.xlabel('log(alpha)')
plt.ylabel('Mean square error')
plt.title('Selecting alpha with cross-validation: Mean square error on each fold')
plt.axis('tight')
ymin, ymax = 2, 5
plt.ylim(ymin, ymax)
plt.savefig(os.path.join(path_figure, 'lasso_CV_MSE_path.png'))


data_log_all_mse = pd.DataFrame(model.mse_path_)
data_log_all_mse.index = model.alphas_
data_log_all_mse.loc[:, 'Average'] = data_log_all_mse.loc[:, :].sum(axis=1) / data_log_all_mse.shape[1]

# printing the predicted values
fig, ax = plt.subplots()
ax.plot(lin_space_log['x'], model.predict(lin_space_log[predictors]), color='seagreen', marker='o',
        linestyle='None', markersize=3)
# ax.plot(data['x'], model.predict(data[predictors]), 'ro')
plt.xlabel('Village night lights (shape, log of sum, 2011)')
plt.ylabel('Number of households electrified in village (in log)')
plt.title('Nonlinear relationship estimated via Lasso with alpha = 10e-10')
plt.scatter(data['x'], data['y'], c='cornflowerblue', s=5, alpha=0.5)
plt.show()
plt.savefig(os.path.join(path_figure, 'lasso_CV_log_all.png'))


############  LOG ALL 10^-3 to 10^10
powers = list(range(-15, 0, 1)) + list(range(0, 55, 1))
alphas = [10 ** (k / 5) for k in powers]
data = data_log_all
print(data.columns.values)
skf = StratifiedKFold(data['strat'], n_folds=100, shuffle=True)
print("Computing regularization path using the coordinate descent lasso...")
model = LassoCV(cv=skf, alphas=alphas).fit(data[predictors], data['y'])
print("Model computed.")
dict_coeff['log_all_3'] = model.coef_
alpha['log_all_3'] = model.alpha_

# printing the predicted values
fig, ax = plt.subplots()
ax.plot(lin_space_log['x'], model.predict(lin_space_log[predictors]), color='seagreen', marker='o',
        linestyle='None', markersize=3)
plt.xlabel('Village night lights (shape, log of sum, 2011)')
plt.ylabel('Number of households electrified in village (in log)')
plt.title('Nonlinear relationship estimated via Lasso with alpha = 10e-3')
plt.scatter(data['x'], data['y'], c='cornflowerblue', s=5, alpha=0.5)
plt.show()
plt.savefig(os.path.join(path_figure, 'lasso_CV_log_all_walpha_minus3.png'))


############  LOG ALL for a large alpha 10^5 to 10^10
powers = list(range(50, 55, 1))
alphas = [10 ** (k / 5) for k in powers]
data = data_log_all
print(data.columns.values)
skf = StratifiedKFold(data['strat'], n_folds=100, shuffle=True)
print("Computing regularization path using the coordinate descent lasso...")
model = LassoCV(cv=skf, alphas=alphas).fit(data[predictors], data['y'])
print("Model computed.")
dict_coeff['log_all_large'] = model.coef_
alpha['log_all_large'] = model.alpha_


##################### TABLE WITH COEFFICIENTS FOR THE VARIOUS MODELS
# for latex
predictors_tex = []
for item in predictors:
    if item == 'x':
        item_mod = '$' + item + '$'
    else:
        item_mod = item.replace('_', '^{')
        item_mod = '$' + item_mod + '}$'
    predictors_tex.append(item_mod)



data_coeffs = pd.DataFrame(None, columns={'Regressors', 'one', '$\\alpha = 0$', '$\\alpha = 10^{-10}$',
                                          '$\\alpha = 10^{-3}$', '$\\alpha = 10^{+10}$'})
data_coeffs.loc[:, 'Regressors'] = predictors_tex
data_coeffs.index = data_coeffs['Regressors']
del data_coeffs['Regressors']
data_coeffs = data_coeffs[['one', '$\\alpha = 0$', '$\\alpha = 10^{-10}$', '$\\alpha = 10^{-3}$', '$\\alpha = 10^{+10}$']]

data_coeffs.loc[:, 'one'] = '-'
data_coeffs.loc['$x$', 'one'] = round(coeff_lin[0], 3)
data_coeffs.loc[:, '$\\alpha = 0$'] = coeff_alpha0
data_coeffs.loc[:, '$\\alpha = 10^{-10}$'] = dict_coeff['log_all10']
data_coeffs.loc[:, '$\\alpha = 10^{-3}$'] = dict_coeff['log_all_3']
data_coeffs.loc[:, '$\\alpha = 10^{+10}$'] = dict_coeff['log_all_large']

data_coeffs.iloc[:, 1] = data_coeffs.iloc[:, 1].map('{:,.2g}'.format)
data_coeffs.iloc[:, 2] = data_coeffs.iloc[:, 2].map('{:,.2g}'.format)
data_coeffs.iloc[:, 3] = data_coeffs.iloc[:, 3].map('{:,.2g}'.format)
data_coeffs.iloc[:, 4] = data_coeffs.iloc[:, 4].map('{:,.2g}'.format)

data_coeffs.loc['MSE', 'one'] = round(mse_lin, 4)
data_coeffs.loc['MSE', '$\\alpha = 0$'] = round(mse_alpha0, 4)
data_coeffs.loc['MSE', '$\\alpha = 10^{-10}$'] = round(data_log_all_mse.loc[1.000000e-10, 'Average'], 4)
data_coeffs.loc['MSE', '$\\alpha = 10^{-3}$'] = round(data_log_all_mse.loc[1.000000e-3, 'Average'], 4)
data_coeffs.loc['MSE', '$\\alpha = 10^{+10}$'] = round(data_log_all_mse.loc[1.000000e+10, 'Average'], 4)

# to latex
latex_table = data_coeffs.to_latex(index=True, escape=False).replace('llllll', 'lrrrrr')

latex_table = latex_table.replace('one', '$\\alpha = 0$')

path_tables = '/home/emd/Dropbox/Night Lights Validation (Eugenie, Ryan, Johannes)/Manuscript/Tables/'
with open(os.path.join(path_tables, "lasso_coeffs.tex"), "w") as f:
    f.write(latex_table)






############  LOG POSITIVE
data = data_log_pos
print('log_pos')
print(data.columns.values)
skf = StratifiedKFold(data['strat'], n_folds=20, shuffle=True)
print("Computing regularization path using the coordinate descent lasso...")
model = LassoCV(cv=skf, alphas=alphas).fit(data[predictors], data['y'])
print("Model computed.")
dict_coeff['log_pos'] = model.coef_
alpha['log_pos'] = model.alpha_
fig, ax = plt.subplots()
ax.plot(lin_space_logpos['x'], model.predict(lin_space_logpos[predictors]), color='seagreen', marker='o',
        linestyle='None',
        markersize=3)
# ax.plot(data['x'], model.predict(data[predictors]), 'ro')
plt.xlabel('Village night lights (shape, log of sum, 2011)')
plt.ylabel('Number of households electrified in village (in log)')
plt.title('Nonlinear relationship estimated via Lasso and cross validation (positive values)')
xmin, xmax = -2, 10
plt.xlim(xmin, xmax)
plt.show()
plt.scatter(data['x'], data['y'], c='cornflowerblue', s=5, alpha=0.5)
plt.savefig(os.path.join(path_figure, 'lasso_CV_log_pos.png'))








